In [2]:
repo_pth = '../repos/'
import os
import sys
from datetime import datetime
import numpy as np
import pandas
from matplotlib import pyplot as plt
import sqlite3

add_paths = [repo_pth+'rfcx-worker-analysis/modules/domain_modules', repo_pth+'notebook-display']
for p in add_paths:
    if p not in sys.path:    
        sys.path.append(p)

import spectral_analysis
import load_sound
import fingerprinting

from IPython.html.widgets import interactive, Checkbox, interact
from IPython.display import display, HTML
from IPython.html import widgets

import nbio


loaded nbio
:0: FutureWarning: IPython widgets are experimental and may change in the future.

In [3]:
reload(load_sound)
reload(spectral_analysis)
reload(fingerprinting)
reload(nbio)
show = nbio.show
read_sound = load_sound.read_sound
Sound, Spectrum = load_sound.Sound, spectral_analysis.Spectrum
Profile = fingerprinting.Profile
read_sound = load_sound.read_sound
write_sound = load_sound.write_sound
def play(snd): nbio.play(snd.data, snd.samplerate)


loaded nbio

In [4]:
resource_pth = '../../opt/rfcx-data/'
resource_pth = '../resources/'
event_fn = resource_pth+'events.tsv'
data_pth = resource_pth+'wav/'

in_dir = sorted(os.listdir(data_pth))
print '%s files found in %s' % (len(set(in_dir)), data_pth)


459 files found in ../resources/wav/

Read tsv file


In [5]:
def parse_seek(x):
    m,s = map(int,x.split(':'))
    return m*60+s
    
df = pandas.io.parsers.read_csv(
    event_fn, 
    delim_whitespace=True,
    parse_dates = ['time'],
    infer_datetime_format=True,
)

Filter out those for which we don't have a file


In [1]:
in_file = list(df.location)
df['has_file'] = np.in1d(df.location, in_dir)
df = df.groupby('has_file').get_group(True).copy()


---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-1-fee024536760> in <module>()
----> 1 in_file = list(df.location)
      2 df['has_file'] = np.in1d(df.location, in_dir)
      3 df = df.groupby('has_file').get_group(True).copy()

NameError: name 'df' is not defined

Add some columns


In [7]:
df['seek_sec'] = map(parse_seek, df.seek)
df['start_seek'] =  df['seek_sec']-5
df['stop_seek'] =  df['seek_sec']+10

In [12]:
df


Out[12]:
type id guardian time unixtime weekday weekday_num hour day_night seek location has_file seek_sec start_seek stop_seek
0 car 48043ef7e3bc-2015-01-05T09-38-08 48043ef7e3bc 2015-01-05 09:38:55 1420450735 Mon 1 10 day 0:47 48043ef7e3bc-2015-01-05T09-38-08.wav True 47 42 57
1 car 48043ef7e3bc-2015-01-24T03-40-13 48043ef7e3bc 2015-01-24 03:43:45 1422071025 Sat 6 4 night 3:32 48043ef7e3bc-2015-01-24T03-40-13.wav True 212 207 222
2 car 48043ef7e3bc-2015-01-27T15-03-45 48043ef7e3bc 2015-01-27 15:03:57 1422371037 Tue 2 16 day 0:12 48043ef7e3bc-2015-01-27T15-03-45.wav True 12 7 22
3 car 48043ef7e3bc-2015-01-27T15-15-47 48043ef7e3bc 2015-01-27 15:17:32 1422371852 Tue 2 16 day 1:45 48043ef7e3bc-2015-01-27T15-15-47.wav True 105 100 115
4 car 48043ef7e3bc-2015-01-27T15-15-47 48043ef7e3bc 2015-01-27 15:18:08 1422371888 Tue 2 16 day 2:21 48043ef7e3bc-2015-01-27T15-15-47.wav True 141 136 151
5 car 48043ef7e3bc-2015-01-27T15-27-49 48043ef7e3bc 2015-01-27 15:28:18 1422372498 Tue 2 16 day 0:29 48043ef7e3bc-2015-01-27T15-27-49.wav True 29 24 39
6 car 48043ef7e3bc-2015-01-27T15-27-49 48043ef7e3bc 2015-01-27 15:28:22 1422372502 Tue 2 16 day 0:33 48043ef7e3bc-2015-01-27T15-27-49.wav True 33 28 43
7 car 48043ef7e3bc-2015-01-27T15-27-49 48043ef7e3bc 2015-01-27 15:30:36 1422372636 Tue 2 16 day 2:47 48043ef7e3bc-2015-01-27T15-27-49.wav True 167 162 177
8 car 48043ef7e3bc-2015-01-27T15-27-49 48043ef7e3bc 2015-01-27 15:33:31 1422372811 Tue 2 16 day 5:42 48043ef7e3bc-2015-01-27T15-27-49.wav True 342 337 352
9 car 48043ef7e3bc-2015-01-27T15-33-50 48043ef7e3bc 2015-01-27 15:34:39 1422372879 Tue 2 16 day 0:49 48043ef7e3bc-2015-01-27T15-33-50.wav True 49 44 59
10 car 48043ef7e3bc-2015-01-28T14-52-26 48043ef7e3bc 2015-01-28 14:57:48 1422457068 Wed 3 15 day 5:22 48043ef7e3bc-2015-01-28T14-52-26.wav True 322 317 332
11 car 48043ef7e3bc-2015-01-28T18-58-57 48043ef7e3bc 2015-01-28 19:04:54 1422471894 Wed 3 20 night 5:57 48043ef7e3bc-2015-01-28T18-58-57.wav True 357 352 367
12 car 48043ef7e3bc-2015-01-28T19-16-59 48043ef7e3bc 2015-01-28 19:17:09 1422472629 Wed 3 20 night 0:10 48043ef7e3bc-2015-01-28T19-16-59.wav True 10 5 20
13 car 48043ef7e3bc-2015-01-28T19-22-59 48043ef7e3bc 2015-01-28 19:28:16 1422473296 Wed 3 20 night 5:17 48043ef7e3bc-2015-01-28T19-22-59.wav True 317 312 327
14 car 48043ef7e3bc-2015-01-28T19-35-01 48043ef7e3bc 2015-01-28 19:39:58 1422473998 Wed 3 20 night 4:57 48043ef7e3bc-2015-01-28T19-35-01.wav True 297 292 307
15 car 48043ef7e3bc-2015-01-28T21-17-13 48043ef7e3bc 2015-01-28 21:19:10 1422479950 Wed 3 22 night 1:57 48043ef7e3bc-2015-01-28T21-17-13.wav True 117 112 127
16 car ed7f84df28eb-2015-02-07T10-48-49 ed7f84df28eb 2015-02-07 10:51:43 1423306303 Sat 6 11 day 2:54 ed7f84df28eb-2015-02-07T10-48-49.wav True 174 169 184
17 car ed7f84df28eb-2015-02-07T13-49-11 ed7f84df28eb 2015-02-07 13:49:54 1423316994 Sat 6 14 day 0:43 ed7f84df28eb-2015-02-07T13-49-11.wav True 43 38 53
18 car ed7f84df28eb-2015-02-08T06-03-11 ed7f84df28eb 2015-02-08 06:03:46 1423375426 Sun 7 7 day 0:35 ed7f84df28eb-2015-02-08T06-03-11.wav True 35 30 45
19 car ed7f84df28eb-2015-02-08T06-09-12 ed7f84df28eb 2015-02-08 06:11:25 1423375885 Sun 7 7 day 2:13 ed7f84df28eb-2015-02-08T06-09-12.wav True 133 128 143
20 car b0eb9c751fa5-2015-02-13T11-53-56 b0eb9c751fa5 2015-02-13 11:55:17 1423828517 Fri 5 12 day 1:21 b0eb9c751fa5-2015-02-13T11-53-56.wav True 81 76 91
21 car b0eb9c751fa5-2015-02-13T11-53-56 b0eb9c751fa5 2015-02-13 11:56:19 1423828579 Fri 5 12 day 2:23 b0eb9c751fa5-2015-02-13T11-53-56.wav True 143 138 153
22 car b0eb9c751fa5-2015-02-13T11-53-56 b0eb9c751fa5 2015-02-13 11:59:08 1423828748 Fri 5 12 day 5:12 b0eb9c751fa5-2015-02-13T11-53-56.wav True 312 307 322
23 car b0eb9c751fa5-2015-02-13T12-17-59 b0eb9c751fa5 2015-02-13 12:19:17 1423829957 Fri 5 13 day 1:18 b0eb9c751fa5-2015-02-13T12-17-59.wav True 78 73 88
24 car b0eb9c751fa5-2015-02-13T12-17-59 b0eb9c751fa5 2015-02-13 12:20:30 1423830030 Fri 5 13 day 2:31 b0eb9c751fa5-2015-02-13T12-17-59.wav True 151 146 161
25 car b0eb9c751fa5-2015-02-13T12-30-00 b0eb9c751fa5 2015-02-13 12:30:26 1423830626 Fri 5 13 day 0:26 b0eb9c751fa5-2015-02-13T12-30-00.wav True 26 21 36
26 car 6a7b86c28a67-2015-02-13T13-24-30 6a7b86c28a67 2015-02-13 13:30:09 1423834209 Fri 5 14 day 5:39 6a7b86c28a67-2015-02-13T13-24-30.wav True 339 334 349
27 car 6a7b86c28a67-2015-02-13T16-07-02 6a7b86c28a67 2015-02-13 16:11:54 1423843914 Fri 5 17 day 4:52 6a7b86c28a67-2015-02-13T16-07-02.wav True 292 287 302
28 car 6a7b86c28a67-2015-02-13T16-19-04 6a7b86c28a67 2015-02-13 16:19:45 1423844385 Fri 5 17 day 0:41 6a7b86c28a67-2015-02-13T16-19-04.wav True 41 36 51
29 car 6a7b86c28a67-2015-02-14T14-53-37 6a7b86c28a67 2015-02-14 14:54:06 1423925646 Sat 6 15 day 0:29 6a7b86c28a67-2015-02-14T14-53-37.wav True 29 24 39
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
1963 truck 22720ac238e1-2015-03-02T12-12-43 22720ac238e1 2015-03-02 12:12:44 1425298364 Mon 1 13 day 0:01 22720ac238e1-2015-03-02T12-12-43.wav True 1 -4 11
1964 truck d2f3d1c7cca5-2015-03-02T12-12-12 d2f3d1c7cca5 2015-03-02 12:13:17 1425298397 Mon 1 13 day 1:05 d2f3d1c7cca5-2015-03-02T12-12-12.wav True 65 60 75
1965 truck 22720ac238e1-2015-03-02T12-12-43 22720ac238e1 2015-03-02 12:13:45 1425298425 Mon 1 13 day 1:02 22720ac238e1-2015-03-02T12-12-43.wav True 62 57 72
1966 truck 22720ac238e1-2015-03-02T12-15-45 22720ac238e1 2015-03-02 12:16:30 1425298590 Mon 1 13 day 0:45 22720ac238e1-2015-03-02T12-15-45.wav True 45 40 55
1967 truck 22720ac238e1-2015-03-02T12-15-45 22720ac238e1 2015-03-02 12:16:41 1425298601 Mon 1 13 day 0:56 22720ac238e1-2015-03-02T12-15-45.wav True 56 51 66
1968 truck 22720ac238e1-2015-03-02T12-15-45 22720ac238e1 2015-03-02 12:17:07 1425298627 Mon 1 13 day 1:22 22720ac238e1-2015-03-02T12-15-45.wav True 82 77 92
1969 truck 22720ac238e1-2015-03-02T12-17-16 22720ac238e1 2015-03-02 12:17:21 1425298641 Mon 1 13 day 0:05 22720ac238e1-2015-03-02T12-17-16.wav True 5 0 15
1970 truck 22720ac238e1-2015-03-02T12-17-16 22720ac238e1 2015-03-02 12:17:32 1425298652 Mon 1 13 day 0:16 22720ac238e1-2015-03-02T12-17-16.wav True 16 11 26
1971 truck 22720ac238e1-2015-03-02T12-55-11 22720ac238e1 2015-03-02 12:56:04 1425300964 Mon 1 13 day 0:53 22720ac238e1-2015-03-02T12-55-11.wav True 53 48 63
1972 truck 686862515160-2015-03-02T13-10-00 686862515160 2015-03-02 13:10:21 1425301821 Mon 1 14 day 0:21 686862515160-2015-03-02T13-10-00.wav True 21 16 31
1973 truck 686862515160-2015-03-02T13-16-02 686862515160 2015-03-02 13:16:44 1425302204 Mon 1 14 day 0:42 686862515160-2015-03-02T13-16-02.wav True 42 37 52
1974 truck d2f3d1c7cca5-2015-03-02T19-15-04 d2f3d1c7cca5 2015-03-02 19:16:26 1425323786 Mon 1 20 night 1:22 d2f3d1c7cca5-2015-03-02T19-15-04.wav True 82 77 92
1975 truck 686862515160-2015-03-03T09-47-46 686862515160 2015-03-03 09:49:31 1425376171 Tue 2 10 day 1:45 686862515160-2015-03-03T09-47-46.wav True 105 100 115
1976 truck 686862515160-2015-03-03T10-22-02 686862515160 2015-03-03 10:22:41 1425378161 Tue 2 11 day 0:39 686862515160-2015-03-03T10-22-02.wav True 39 34 49
1977 truck 686862515160-2015-03-03T12-22-51 686862515160 2015-03-03 12:23:48 1425385428 Tue 2 13 day 0:57 686862515160-2015-03-03T12-22-51.wav True 57 52 67
1978 truck d2f3d1c7cca5-2015-03-03T20-30-34 d2f3d1c7cca5 2015-03-03 20:31:24 1425414684 Tue 2 21 night 0:50 d2f3d1c7cca5-2015-03-03T20-30-34.wav True 50 45 60
1979 truck d2f3d1c7cca5-2015-03-03T20-47-12 d2f3d1c7cca5 2015-03-03 20:48:39 1425415719 Tue 2 21 night 1:27 d2f3d1c7cca5-2015-03-03T20-47-12.wav True 87 82 97
1980 truck d2f3d1c7cca5-2015-03-03T21-28-03 d2f3d1c7cca5 2015-03-03 21:29:26 1425418166 Tue 2 22 night 1:23 d2f3d1c7cca5-2015-03-03T21-28-03.wav True 83 78 93
1981 truck d2f3d1c7cca5-2015-03-03T21-58-18 d2f3d1c7cca5 2015-03-03 21:59:21 1425419961 Tue 2 22 night 1:03 d2f3d1c7cca5-2015-03-03T21-58-18.wav True 63 58 73
1982 truck d2f3d1c7cca5-2015-03-04T13-05-03 d2f3d1c7cca5 2015-03-04 13:05:39 1425474339 Wed 3 14 day 0:36 d2f3d1c7cca5-2015-03-04T13-05-03.wav True 36 31 46
1983 truck d2f3d1c7cca5-2015-03-04T13-05-03 d2f3d1c7cca5 2015-03-04 13:05:46 1425474346 Wed 3 14 day 0:43 d2f3d1c7cca5-2015-03-04T13-05-03.wav True 43 38 53
1984 truck d2f3d1c7cca5-2015-03-04T13-06-34 d2f3d1c7cca5 2015-03-04 13:06:45 1425474405 Wed 3 14 day 0:11 d2f3d1c7cca5-2015-03-04T13-06-34.wav True 11 6 21
1985 truck d2f3d1c7cca5-2015-03-04T13-06-34 d2f3d1c7cca5 2015-03-04 13:07:04 1425474424 Wed 3 14 day 0:30 d2f3d1c7cca5-2015-03-04T13-06-34.wav True 30 25 40
1986 truck d2f3d1c7cca5-2015-03-04T13-06-34 d2f3d1c7cca5 2015-03-04 13:07:31 1425474451 Wed 3 14 day 0:57 d2f3d1c7cca5-2015-03-04T13-06-34.wav True 57 52 67
1987 truck d2f3d1c7cca5-2015-03-04T13-06-34 d2f3d1c7cca5 2015-03-04 13:07:44 1425474464 Wed 3 14 day 1:10 d2f3d1c7cca5-2015-03-04T13-06-34.wav True 70 65 80
1988 truck d2f3d1c7cca5-2015-03-04T13-06-34 d2f3d1c7cca5 2015-03-04 13:07:47 1425474467 Wed 3 14 day 1:13 d2f3d1c7cca5-2015-03-04T13-06-34.wav True 73 68 83
1989 truck d2f3d1c7cca5-2015-03-04T13-06-34 d2f3d1c7cca5 2015-03-04 13:07:50 1425474470 Wed 3 14 day 1:16 d2f3d1c7cca5-2015-03-04T13-06-34.wav True 76 71 86
1990 truck d2f3d1c7cca5-2015-03-04T13-40-32 d2f3d1c7cca5 2015-03-04 13:40:41 1425476441 Wed 3 14 day 0:09 d2f3d1c7cca5-2015-03-04T13-40-32.wav True 9 4 19
1991 truck d2f3d1c7cca5-2015-03-04T15-11-15 d2f3d1c7cca5 2015-03-04 15:12:15 1425481935 Wed 3 16 day 1:00 d2f3d1c7cca5-2015-03-04T15-11-15.wav True 60 55 70
1992 truck d2f3d1c7cca5-2015-03-04T19-31-31 d2f3d1c7cca5 2015-03-04 19:32:50 1425497570 Wed 3 20 night 1:19 d2f3d1c7cca5-2015-03-04T19-31-31.wav True 79 74 89

1993 rows × 15 columns


In [14]:
dt0 = df['time'].min()  #datetime(2011,1,1)
df['timestamp'] = [e.seconds for e in df['time']-dt0]

Create database to store results persistently during training


In [8]:
conn = sqlite3.connect('events_checked.db')
c = conn.cursor()

#c.execute("DROP TABLE event_checked")

#c.execute('''CREATE TABLE event_checked
#             (id text, checked bool, valid bool)''')
#for i,e in dt.iterrows():
#    eid = s.id+':'+str(s.seek_sec)
#    c.execute("INSERT into event_checked VALUES (?,0,0)", [eid])

This guardian had a high number of false positives and poor sound quality. Eliminate it from training


In [54]:
c.execute("UPDATE event_checked SET checked=1, valid=4 WHERE id LIKE '649c72d5aa6c%'")
conn.commit()


Out[54]:
<sqlite3.Cursor at 0x7f78753a1570>

Set up the GUI


In [8]:
#checked = np.memmap('checked.dat', mode='w+', shape=(len(events),))

opts = ['F','T','na']

def f(i, cb):
    #print 'you changed id',i,'to',cb.value
    #checked[i] = 
    valid = opts.index(cb.value)
    chkd = 1
    if valid==2: chkd=0
    #print chkd, valid, i
    c.execute("UPDATE event_checked SET checked=?, valid=? WHERE id=?", [chkd, valid, i])
    #conn.commit()

def factory(i,cb):
    def fx():
        return f(i,cb)
    return fx

In [9]:
events = [e for e in list(df.iterrows()) if e[1].has_file]

In [50]:
#checked.flush()
conn.commit()
#offset = np.argwhere(checked==0)[0][0]
c.execute("SELECT id FROM event_checked WHERE checked=0")
#print c.fetchone()
sid, ss = c.fetchone()[0].split(':')
sdf =  df[df['id']==sid]
edf = sdf[sdf['seek_sec']==int(ss)]
offset = list(edf.iterrows())[0][0]
#print offset
#offset = 1
count = 10
#read_sound(data_pth+loc, {}).crop(start,stop)
rows = []
maxn = max(len(events),offset+count )
for i,t in events[offset:maxn]:
    loc = t.location
    start, stop = t.start_seek, t.stop_seek
    snd = read_sound(data_pth+loc, {}).crop(start,stop)
    src = nbio.wavPlayer_html(snd.data, snd.samplerate)
    hsrc = widgets.HTML(src)
    rb = widgets.RadioButtons(options=opts)
    c.execute("SELECT checked,valid FROM event_checked WHERE id=?", [t.id+':'+str(t.seek_sec)])
    chkd, vald = c.fetchall()[0]
    if chkd==0: vald=2
    rb.value = opts[vald]
    txt = widgets.Text('%s, %s, %s-%s'%(i,t.id, start, stop))
    rows.append(widgets.HBox([rb, hsrc, txt]))
    rb.on_trait_change(factory(t.id+':'+str(t.seek_sec),rb))

container = widgets.Box(children=rows)
display(container)


---------------------------------------------------------------------------
TypeError                                 Traceback (most recent call last)
<ipython-input-50-2f2ca3cde69f> in <module>()
      4 c.execute("SELECT id FROM event_checked WHERE checked=0")
      5 #print c.fetchone()
----> 6 sid, ss = c.fetchone()[0].split(':')
      7 sdf =  df[df['id']==sid]
      8 edf = sdf[sdf['seek_sec']==int(ss)]

TypeError: 'NoneType' object has no attribute '__getitem__'

In [55]:
conn.commit()

Transfer the training results to the dataframe


In [62]:
valids=[]
checks=[]
for i,t in list(df.iterrows()):
    ii = t.id+':'+str(t.seek_sec)
    c.execute("SELECT * from event_checked where id=?", [ii])
    e = c.fetchone()
    if e is None: 
        ck, va = 0,0
    else:
        ck, va = e[1:]
    checks.append(ck)
    valids.append(va)
df['checked'] = checks
df['valid'] = valids

In [211]:
grouped  = df.groupby(['guardian'])
ids = grouped.groups.keys()
gdf = grouped.get_group('48043ef7e3bc')
ids


Out[211]:
['ed7f84df28eb',
 '6a7b86c28a67',
 '686862515160',
 'd2f3d1c7cca5',
 '649c72d5aa6c',
 '48043ef7e3bc',
 '5dbdf80c2085',
 'b0eb9c751fa5',
 '22720ac238e1']

In [18]:
plt.clf()
plt.cla()
#a = gdf['timestamp'].plot(kind='hist', bins=100)
#print a
a = (df['timestamp']/10000.).hist(by=df['guardian'], bins=100)
nbio.show(plt.gcf())